Candida glabrata strains:
Treated with voriconazole (VCZ / VORI), fluconazole (FCZ / FLUC) or DMSO (solvent control).
Sequencing FASTQ files were aligned to appropriate genomes using HISAT2 and quantified using featureCounts.
MultiQC results, including quality control of raw reads (FastQC) and alignment are here:
featureCount results, aggregated by strain, are below:
counts_all_CBS138 = read.table("../data/featureCounts/Candida_glabrata_CBS138/CBS138_all_conditions_featureCounts.tab", header=TRUE)
counts_all_BG2 = read.table("../data/featureCounts/Candida_glabrata_BG2/BG2_all_conditions_featureCounts.tab", header=TRUE)
counts_rpm_CBS138 = read.table("../data/featureCounts/Candida_glabrata_CBS138/CBS138_all_conditions_rpm.tab", header=TRUE)
counts_rpm_BG2 = read.table("../data/featureCounts/Candida_glabrata_BG2/BG2_all_conditions_rpm.tab", header=TRUE)
head(counts_all_CBS138) # CBS138, raw
## Geneid CBS138_RPMI_DMSO_A CBS138_RPMI_VORI_A CBS138_RPMI_FLUC_A
## 0 CAGL0A00165g 329 1277 953
## 1 CAGL0A00187g 541 1561 1755
## 2 CAGL0A00209g 2460 6237 7252
## 3 CAGL0A00231g 243 172 375
## 4 CAGL0A00253g 791 2009 2340
## 5 CAGL0A00275g 1295 2665 2876
## CBS138_RPMI_DMSO_B CBS138_RPMI_VORI_B CBS138_RPMI_FLUC_B CBS138_RPMI_DMSO_C
## 0 653 873 1124 422
## 1 1240 1434 1990 950
## 2 5037 6009 7581 3992
## 3 292 305 235 426
## 4 1336 2031 2071 1217
## 5 2133 2716 2933 1643
## CBS138_RPMI_VORI_C CBS138_RPMI_FLUC_C
## 0 436 3425
## 1 1120 7031
## 2 1345 18963
## 3 118 1572
## 4 395 8426
## 5 488 8892
head(counts_all_BG2) # BG2, raw
## Geneid BG2_RPMI_DMSO_A BG2_RPMI_VORI_A BG2_RPMI_FLUC_A BG2_RPMI_DMSO_B
## 0 GWK60_A00011 1 1 1 1
## 1 GWK60_A00033 100 169 93 116
## 2 GWK60_A00055 1490 2411 1982 1417
## 3 GWK60_A00077 4781 8038 6834 5852
## 4 GWK60_A00099 14463 20284 15228 15459
## 5 GWK60_A00121 2371 1412 816 2188
## BG2_RPMI_VORI_B BG2_RPMI_FLUC_B BG2_RPMI_DMSO_C BG2_RPMI_VORI_C
## 0 1 3 1 1
## 1 194 17 123 19
## 2 2422 304 1838 223
## 3 7534 1569 5704 1015
## 4 18965 2859 17357 1804
## 5 979 136 3000 63
## BG2_RPMI_FLUC_C
## 0 1
## 1 26
## 2 394
## 3 1236
## 4 2528
## 5 146
Results normalised to RPM are below:
head(counts_rpm_CBS138) # CBS138, normalised to RPM
## Geneid CBS138_RPMI_DMSO_A CBS138_RPMI_VORI_A CBS138_RPMI_FLUC_A
## 1 CAGL0A00165g 30.85212 65.75689 46.30258
## 2 CAGL0A00187g 50.73251 80.38097 85.26866
## 3 CAGL0A00209g 230.68756 321.16343 352.34663
## 4 CAGL0A00231g 22.78743 8.85684 18.21980
## 5 CAGL0A00253g 74.17637 103.44995 113.69155
## 6 CAGL0A00275g 121.43918 137.22953 139.73371
## CBS138_RPMI_DMSO_B CBS138_RPMI_VORI_B CBS138_RPMI_FLUC_B CBS138_RPMI_DMSO_C
## 1 38.03028 47.18167 51.79789 32.91856
## 2 72.21676 77.50116 91.70624 74.10577
## 3 293.35146 324.75904 349.35928 311.40026
## 4 17.00588 16.48386 10.82963 33.23059
## 5 77.80773 109.76629 95.43900 94.93340
## 6 124.22447 146.78741 135.16301 128.16398
## CBS138_RPMI_VORI_C CBS138_RPMI_FLUC_C
## 1 49.17631 46.07813
## 2 126.32448 94.59134
## 3 151.70216 255.11813
## 4 13.30919 21.14885
## 5 44.55194 113.35893
## 6 55.04138 119.62824
head(counts_rpm_BG2) # BG2, normalised to RPM
## Geneid BG2_RPMI_DMSO_A BG2_RPMI_VORI_A BG2_RPMI_FLUC_A BG2_RPMI_DMSO_B
## 1 GWK60_A00011 0.01411581 0.01238536 0.01580123 0.01571183
## 2 GWK60_A00033 1.41158064 2.09312569 1.46951436 1.82257193
## 3 GWK60_A00055 21.03255147 29.86110080 31.31803713 22.26365885
## 4 GWK60_A00077 67.48767020 99.55351647 107.98560331 91.94561156
## 5 GWK60_A00099 204.15690735 251.22462406 240.62112485 242.88913348
## 6 GWK60_A00121 33.46857687 17.48812705 12.89380338 34.37747746
## BG2_RPMI_VORI_B BG2_RPMI_FLUC_B BG2_RPMI_DMSO_C BG2_RPMI_VORI_C
## 1 0.0130404 0.2616564 0.01394962 0.1205643
## 2 2.5298376 1.4827197 1.71580380 2.2907212
## 3 31.5838485 26.5145175 25.63940963 26.8858332
## 4 98.2463727 136.8463092 79.56865753 122.3727387
## 5 247.3111836 249.3585711 242.12363055 217.4979513
## 6 12.7665515 11.8617578 41.84887317 7.5955493
## BG2_RPMI_FLUC_C
## 1 0.09884977
## 2 2.57009413
## 3 38.94681101
## 4 122.17832084
## 5 249.89222903
## 6 14.43206702
The genes between the two strains are matched using Reciprocal Blast Best Hits, which might not be entirely accurate given copy number variations etc. between the two species. I have used orthofinder https://doi.org/10.1186/s13059-019-1832-y to find orthologs, but I haven’t found a way to incorporate this into the analysis yet.
# Load reciprocal BLAST best hits
rbbh = read.csv("../data/rbbh/cbs138_bg2_rbbh2.csv")
# merge on CBS138 gene names,
counts_all_CBS138_rbbh = merge(counts_all_CBS138, rbbh,
by.x="Geneid", by.y="gene_id_cbs138")
# combine counts from common genes.
counts_all_joined = merge(counts_all_CBS138_rbbh,
dplyr::rename(counts_all_BG2, gene_id_bg2 = "Geneid"),
by = "gene_id_bg2")
head(counts_all_joined)
## gene_id_bg2 Geneid CBS138_RPMI_DMSO_A CBS138_RPMI_VORI_A
## 1 GWK60_A00011 CAGL0H10626g 119 153
## 2 GWK60_A00055 CAGL0A00165g 329 1277
## 3 GWK60_A00077 CAGL0A00187g 541 1561
## 4 GWK60_A00099 CAGL0A00209g 2460 6237
## 5 GWK60_A00121 CAGL0A00231g 243 172
## 6 GWK60_A00143 CAGL0A00253g 791 2009
## CBS138_RPMI_FLUC_A CBS138_RPMI_DMSO_B CBS138_RPMI_VORI_B CBS138_RPMI_FLUC_B
## 1 371 218 354 317
## 2 953 653 873 1124
## 3 1755 1240 1434 1990
## 4 7252 5037 6009 7581
## 5 375 292 305 235
## 6 2340 1336 2031 2071
## CBS138_RPMI_DMSO_C CBS138_RPMI_VORI_C CBS138_RPMI_FLUC_C BG2_RPMI_DMSO_A
## 1 238 205 2307 1
## 2 422 436 3425 1490
## 3 950 1120 7031 4781
## 4 3992 1345 18963 14463
## 5 426 118 1572 2371
## 6 1217 395 8426 5403
## BG2_RPMI_VORI_A BG2_RPMI_FLUC_A BG2_RPMI_DMSO_B BG2_RPMI_VORI_B
## 1 1 1 1 1
## 2 2411 1982 1417 2422
## 3 8038 6834 5852 7534
## 4 20284 15228 15459 18965
## 5 1412 816 2188 979
## 6 8274 5540 4938 6905
## BG2_RPMI_FLUC_B BG2_RPMI_DMSO_C BG2_RPMI_VORI_C BG2_RPMI_FLUC_C
## 1 3 1 1 1
## 2 304 1838 223 394
## 3 1569 5704 1015 1236
## 4 2859 17357 1804 2528
## 5 136 3000 63 146
## 6 984 5687 608 1004
For quality control, we check correlation between the samples (only within strain).
There is high correlation within untreated (DMSO) samples and treated (FCZ and VCZ), but no obvious differences between VCZ and FCZ.
This shows that the replicates are generally very well correlated, R > 0.96. However, there is more variability in replicate C, especially CBS138_RPMI_VORI_C.
We intially cluster the results on raw rpm, separately by strain. Again untreated samples cluster together and treated samples cluster together. Treated samples cluster in a way that doesn’t indicate any obvious sample mix-up.
CBS138_RPMI_VORI_C is an outlier.
Regularized logarithm, on both strains together joined by best-hit genes.
sample_sheet_all = here::here("data",
"sample_sheets",
"DGE_samplesheet_canGla2.txt") %>%
read_table()
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## SampleID = col_double(),
## Code = col_character(),
## Medium = col_character(),
## Temp = col_character(),
## Time = col_character(),
## Rep = col_character(),
## Species = col_character(),
## Strain = col_character(),
## Condition = col_character()
## )
# samplesheet = py$experiment_data_both
rownames(sample_sheet_all) = sample_sheet_all$Code
## Warning: Setting row names on a tibble is deprecated.
print(sample_sheet_all)
## # A tibble: 18 × 9
## SampleID Code Medium Temp Time Rep Species Strain Condition
## * <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 86 CBS138_RPMI_DMSO_A RPMI_… 37C 4h A Candid… CBS138 CBS138_R…
## 2 87 CBS138_RPMI_VORI_A RPMI_… 37C 4h A Candid… CBS138 CBS138_R…
## 3 88 CBS138_RPMI_FLUC_A RPMI_… 37C 4h A Candid… CBS138 CBS138_R…
## 4 89 BG2_RPMI_DMSO_A RPMI_… 37C 4h A Candid… BG2 BG2_RPMI…
## 5 90 BG2_RPMI_VORI_A RPMI_… 37C 4h A Candid… BG2 BG2_RPMI…
## 6 91 BG2_RPMI_FLUC_A RPMI_… 37C 4h A Candid… BG2 BG2_RPMI…
## 7 92 CBS138_RPMI_DMSO_B RPMI_… 37C 4h B Candid… CBS138 CBS138_R…
## 8 93 CBS138_RPMI_VORI_B RPMI_… 37C 4h B Candid… CBS138 CBS138_R…
## 9 94 CBS138_RPMI_FLUC_B RPMI_… 37C 4h B Candid… CBS138 CBS138_R…
## 10 98 BG2_RPMI_DMSO_B RPMI_… 37C 4h B Candid… BG2 BG2_RPMI…
## 11 99 BG2_RPMI_VORI_B RPMI_… 37C 4h B Candid… BG2 BG2_RPMI…
## 12 100 BG2_RPMI_FLUC_B RPMI_… 37C 4h B Candid… BG2 BG2_RPMI…
## 13 95 CBS138_RPMI_DMSO_C RPMI_… 37C 4h C Candid… CBS138 CBS138_R…
## 14 96 CBS138_RPMI_VORI_C RPMI_… 37C 4h C Candid… CBS138 CBS138_R…
## 15 97 CBS138_RPMI_FLUC_C RPMI_… 37C 4h C Candid… CBS138 CBS138_R…
## 16 101 BG2_RPMI_DMSO_C RPMI_… 37C 4h C Candid… BG2 BG2_RPMI…
## 17 102 BG2_RPMI_VORI_C RPMI_… 37C 4h C Candid… BG2 BG2_RPMI…
## 18 103 BG2_RPMI_FLUC_C RPMI_… 37C 4h C Candid… BG2 BG2_RPMI…
dds_all <- DESeqDataSetFromMatrix(countData =
counts_all_joined %>%
dplyr::select(sample_sheet_all$Code) %>%
magrittr::set_rownames(counts_all_joined$Gene),
colData = sample_sheet_all,
design = ~ Code)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
rlog_all <- rlog(dds_all)
head(assay(rlog_all))
## CBS138_RPMI_DMSO_A CBS138_RPMI_VORI_A CBS138_RPMI_FLUC_A
## CAGL0H10626g 7.532662 7.046592 7.803483
## CAGL0A00165g 9.819161 10.247470 9.981287
## CAGL0A00187g 10.850525 11.007853 11.079555
## CAGL0A00209g 12.708113 12.783775 12.891339
## CAGL0A00231g 9.134426 8.283834 8.798710
## CAGL0A00253g 11.102017 11.177522 11.284519
## BG2_RPMI_DMSO_A BG2_RPMI_VORI_A BG2_RPMI_FLUC_A CBS138_RPMI_DMSO_B
## CAGL0H10626g 5.262604 5.237443 5.253129 7.578427
## CAGL0A00165g 9.575449 9.699059 9.706994 9.911436
## CAGL0A00187g 11.105937 11.269220 11.305368 11.037381
## CAGL0A00209g 12.660509 12.678386 12.613487 12.830194
## CAGL0A00231g 9.522984 8.829991 8.578657 8.830633
## CAGL0A00253g 11.176013 11.266810 11.107186 11.065763
## CBS138_RPMI_VORI_B CBS138_RPMI_FLUC_B BG2_RPMI_DMSO_B
## CAGL0H10626g 7.866795 7.604660 5.264334
## CAGL0A00165g 10.004430 10.069135 9.552677
## CAGL0A00187g 11.013891 11.131689 11.283900
## CAGL0A00209g 12.830300 12.878901 12.730468
## CAGL0A00231g 8.731257 8.427881 9.468714
## CAGL0A00253g 11.263050 11.135540 11.118469
## BG2_RPMI_VORI_B BG2_RPMI_FLUC_B CBS138_RPMI_DMSO_C
## CAGL0H10626g 5.239410 5.617187 7.879963
## CAGL0A00165g 9.723904 9.631222 9.758397
## CAGL0A00187g 11.239058 11.554708 11.019457
## CAGL0A00209g 12.646469 12.694132 12.838603
## CAGL0A00231g 8.581103 8.570093 9.332638
## CAGL0A00253g 11.140956 11.142942 11.186040
## CBS138_RPMI_VORI_C CBS138_RPMI_FLUC_C BG2_RPMI_DMSO_C
## CAGL0H10626g 8.178808 8.388479 5.254219
## CAGL0A00165g 10.166875 10.021087 9.659647
## CAGL0A00187g 11.533233 11.203325 11.169054
## CAGL0A00209g 12.350243 12.666644 12.730393
## CAGL0A00231g 8.690081 8.954762 9.647514
## CAGL0A00253g 10.688575 11.326460 11.138934
## BG2_RPMI_VORI_C BG2_RPMI_FLUC_C
## CAGL0H10626g 5.606251 5.549291
## CAGL0A00165g 9.623289 9.889308
## CAGL0A00187g 11.440342 11.419485
## CAGL0A00209g 12.563927 12.655815
## CAGL0A00231g 8.276823 8.672952
## CAGL0A00253g 10.997610 11.221235
rlog_all_df = as.data.frame(assay(rlog_all))
Normalized rlog gets at relative (not absolute) differences in expression between strains. Here we additionally filter for a minimal count of 100, to remove low-count noise.
These clustering analyses support:
We are not trying to get any more information out of this analysis, e.g. not trying to identify clusters of co-regulated genes, so there is no need to do more arranging the plots.
Calculate principal components from rlog.
# calculate principal components of rlog, after extracting from the dataset
pca_rlog <- rlog_all %>%
assay() %>%
t() %>%
prcomp()
# convert principal components to data frame
pcdf_rlog <- bind_cols(
as_tibble(colData(rlog_all)),
as_tibble(pca_rlog$x)
)
pcdf_rlog
## # A tibble: 18 × 28
## SampleID Code Medium Temp Time Rep Species Strain Condition sizeFactor
## <dbl> <fct> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 86 CBS138… RPMI_… 37C 4h A Candid… CBS138 CBS138_R… 0.368
## 2 87 CBS138… RPMI_… 37C 4h A Candid… CBS138 CBS138_R… 0.851
## 3 88 CBS138… RPMI_… 37C 4h A Candid… CBS138 CBS138_R… 0.870
## 4 89 BG2_RP… RPMI_… 37C 4h A Candid… BG2 BG2_RPMI… 2.29
## 5 90 BG2_RP… RPMI_… 37C 4h A Candid… BG2 BG2_RPMI… 3.14
## 6 91 BG2_RP… RPMI_… 37C 4h A Candid… BG2 BG2_RPMI… 2.56
## 7 92 CBS138… RPMI_… 37C 4h B Candid… CBS138 CBS138_R… 0.650
## 8 93 CBS138… RPMI_… 37C 4h B Candid… CBS138 CBS138_R… 0.775
## 9 94 CBS138… RPMI_… 37C 4h B Candid… CBS138 CBS138_R… 0.923
## 10 98 BG2_RP… RPMI_… 37C 4h B Candid… BG2 BG2_RPMI… 2.25
## 11 99 BG2_RP… RPMI_… 37C 4h B Candid… BG2 BG2_RPMI… 3.06
## 12 100 BG2_RP… RPMI_… 37C 4h B Candid… BG2 BG2_RPMI… 0.435
## 13 95 CBS138… RPMI_… 37C 4h C Candid… CBS138 CBS138_R… 0.510
## 14 96 CBS138… RPMI_… 37C 4h C Candid… CBS138 CBS138_R… 0.318
## 15 97 CBS138… RPMI_… 37C 4h C Candid… CBS138 CBS138_R… 2.98
## 16 101 BG2_RP… RPMI_… 37C 4h C Candid… BG2 BG2_RPMI… 2.52
## 17 102 BG2_RP… RPMI_… 37C 4h C Candid… BG2 BG2_RPMI… 0.323
## 18 103 BG2_RP… RPMI_… 37C 4h C Candid… BG2 BG2_RPMI… 0.403
## # ℹ 18 more variables: PC1 <dbl>, PC2 <dbl>, PC3 <dbl>, PC4 <dbl>, PC5 <dbl>,
## # PC6 <dbl>, PC7 <dbl>, PC8 <dbl>, PC9 <dbl>, PC10 <dbl>, PC11 <dbl>,
## # PC12 <dbl>, PC13 <dbl>, PC14 <dbl>, PC15 <dbl>, PC16 <dbl>, PC17 <dbl>,
## # PC18 <dbl>
propvar_rlog_df <- tibble(
PC = seq.int(1L, ncol(pca_rlog$x) ),
tot_var = pca_rlog$sdev^2,
prop_var = tot_var/sum(tot_var)
)
propvar_rlog_df
## # A tibble: 18 × 3
## PC tot_var prop_var
## <int> <dbl> <dbl>
## 1 1 1.68e+ 2 3.58e- 1
## 2 2 1.55e+ 2 3.31e- 1
## 3 3 6.87e+ 1 1.46e- 1
## 4 4 1.82e+ 1 3.88e- 2
## 5 5 1.60e+ 1 3.40e- 2
## 6 6 1.26e+ 1 2.68e- 2
## 7 7 5.75e+ 0 1.22e- 2
## 8 8 4.79e+ 0 1.02e- 2
## 9 9 3.80e+ 0 8.08e- 3
## 10 10 3.32e+ 0 7.07e- 3
## 11 11 3.07e+ 0 6.54e- 3
## 12 12 2.48e+ 0 5.28e- 3
## 13 13 2.09e+ 0 4.45e- 3
## 14 14 1.78e+ 0 3.79e- 3
## 15 15 1.29e+ 0 2.75e- 3
## 16 16 1.14e+ 0 2.44e- 3
## 17 17 1.04e+ 0 2.21e- 3
## 18 18 1.07e-26 2.29e-29
plot_percentvar_rlog <-
ggplot(data = propvar_rlog_df,
aes(x = PC, y = prop_var)) +
geom_col(fill = "skyblue3") +
scale_x_continuous("principal component",
limits = c(0.4,10.6),
breaks = 1L:10L,
expand = c(0,0)) +
scale_y_continuous("prop. of variance", expand = c(0,0))
plot_percentvar_rlog
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_col()`).
scales_PCA <-
list(
scale_colour_manual("Strain, Treatment",
values = c("CBS138_RPMI_DMSO" = "grey20",
"CBS138_RPMI_FLUC" = "purple3",
"CBS138_RPMI_VORI" = "green4",
"BG2_RPMI_DMSO" = "grey20",
"BG2_RPMI_FLUC" = "purple3",
"BG2_RPMI_VORI" = "green4"),
labels = c("CBS138_RPMI_DMSO" = "CBS138, DMSO",
"CBS138_RPMI_FLUC" = "CBS138, FCZ",
"CBS138_RPMI_VORI" = "CBS138, VCZ",
"BG2_RPMI_DMSO" = "BG2, DMSO",
"BG2_RPMI_FLUC" = "BG2, FCZ",
"BG2_RPMI_VORI" = "BG2, VCZ")),
scale_shape_manual("Strain, Treatment",
values = c("CBS138_RPMI_DMSO" = 16,
"CBS138_RPMI_FLUC" = 16,
"CBS138_RPMI_VORI" = 16,
"BG2_RPMI_DMSO" = 17,
"BG2_RPMI_FLUC" = 17,
"BG2_RPMI_VORI" = 17),
labels = c("CBS138_RPMI_DMSO" = "CBS138, DMSO",
"CBS138_RPMI_FLUC" = "CBS138, FCZ",
"CBS138_RPMI_VORI" = "CBS138, VCZ",
"BG2_RPMI_DMSO" = "BG2, DMSO",
"BG2_RPMI_FLUC" = "BG2, FCZ",
"BG2_RPMI_VORI" = "BG2, VCZ"))
)
pc12 = ggplot(data = pcdf_rlog,
aes(colour = Condition, shape = Condition)
) +
geom_hline(yintercept = 0, colour = "grey90") +
geom_vline(xintercept = 0, colour = "grey90") +
geom_point(aes(x = PC1, y = PC2), size = 2) +
theme(legend.box = "horizontal") +
scales_PCA
legend = get_legend(pc12)
## Warning in get_plot_component(plot, "guide-box"): Multiple components found;
## returning the first one. To return all, use `return_all = TRUE`.
pc12
pc32 = ggplot(data = pcdf_rlog,
aes(colour = Condition, shape = Condition)
) +
geom_hline(yintercept = 0, colour = "grey90") +
geom_vline(xintercept = 0, colour = "grey90") +
geom_point(aes(x = PC3, y = PC2), size = 2) +
scales_PCA
pc32
ggplot(data = pcdf_rlog,
aes(colour = Condition, shape = Rep)
) +
geom_point(aes(x = PC3, y = PC2)) +
scale_shape_identity()
# TODO: change styling to include shapes: circles squares triangles, maybe add Vs and Fs
Conclusions:
In PC1 vs PC2 there are 4 clear clusters: corresponding to treated and untreated strains BG2 and CBS138. In PC3 they look more similar.
The outlier: CBS138_RPMI_VORI_C (light purple) clusters with other bio reps in PC1-PC2, but not in PC3. After discussion, we decided that something was not quite right in that replicate.
Differential Gene Expression analysis was performed using DESeq2. Genes with adjusted p-value < 0.05 and at least 2x change are considered differentially expressed here. The upregulated and downregulated genes are saved in ../data/deseq2
Conclude: the single outlying sample CBS138_RPMI_VORI_C
has minimal effect on the substantial results. Although it changes the
genelists over the cutoff, it doesn’t change much most of the fold
changes and p-values.
Again this does not substantially change outcome: 7 DE genes when the sample is excluded, instead of zero.
How many genes were differentially expressed in each condition? [TODO: make it a figure]
CBS138:
## [1] "Upregulated CBS138 FCZ vs DMSO: 321"
## [1] "Downregulated CBS138 FCZ vs DMSO: 409"
## [1] "Upregulated in CBS138 VCZ vs DMSO: 338"
## [1] "Downregulated in CBS138 VCZ vs DMSO: 353"
## [1] "Upregulated in CBS138 VCZ vs FCZ: 0"
## [1] "Downregulated in CBS138 VCZ vs FCZ: 0"
BG2:
## [1] "Upregulated in BG2 FCZ vs DMSO: 271"
## [1] "Downregulated in BG2 FCZ vs DMSO: 450"
## [1] "Upregulated in BG2 VCZ vs DMSO: 234"
## [1] "Downregulated in BG2 VCZ vs DMSO: 328"
## [1] "Upregulated in BG2 VCZ vs FCZ: 0"
## [1] "Downregulated in BG2 VCZ vs FCZ: 0"
Conclusions:
Using curated orthologs between C.glabrata CBS138 and S.cerevisiae (downloaded from candidagenome.org), I identified standard and systematic S. cerevisiae names for C.glabrata genes up- and down-regulated in FCZ vs DMSO. There is no need to do repeat it in VCZ, because there are no significant DEG between these two conditions.
Orthologs are also identified between C.glabrata strains using Reciprocal Blast Best Hits.
Here I look at overlap between genes upregulated/downregulated in FCZ treatment between CBS138 and BG2. This includes:
## [1] "Number of FCZ upregulated genes in CBS138: 321 that have a homolog in BG2: 314"
## [1] "Number of FCZ downregulated genes in CBS138: 409 that have a homolog in BG2: 405"
## [1] "Number of FCZ upregulated genes in BG2: 271 that have a homolog in CBS138: 258"
## [1] "Number of FCZ downregulated genes in BG2: 450 that have a homolog in CBS138: 443"
## [1] "Number of FCZ upregulated genes overlapping between CBS138 and BG2: 169"
## [1] "Number of FCZ downregulated genes overlapping between CBS138 and BG2: 269"
Conclusions:
Based on this, I used GO term enrichment (candidagenome.org) of up- and downregulated genes in CBS138, BG2 and common to both strains.
The results are in ../data/go/ folder.
Both strains up (selected):
Only CBS138 up:
BG2 only up (selected):
Conclusions:
Both strains down (selected):
CBS138 only:
BG2 only:
Conclusions:
Using data from two publications, both in CBS138:
I looked at differential expression of two transcription factor targets involved in antifungal resistance. This part of the analysis in CBS138 only.
This is not enrichment analysis, just search for overlapping genes.
For comparison, I’m also showing targets of S. cerevisiae homologs of these two genes (targets downloaded from SGD).
## [1] "Number of PDR1 targets in C.glabrata: 25, number of upregulated genes: 261, number of downregulated genes: 344"
## [1] "Intersection UP:"
## [1] "ATF2" "YOR1" "BAG7" "RTA1" "LAF1"
## [1] "Intersection DOWN:"
## [1] "PDR16" "RSB1"
## [1] "Number of PDR1 targets in S. cerevisiae: 44, , number of upregulated genes: 261, number of downregulated genes: 344"
## [1] "Intersection UP:"
## [1] "GSC2" "VPS4"
## [1] "Intersection DOWN:"
## [1] "RPC53" "RPO26"
## [1] "Number of UPC2A targets in C. glabrata: 237, , number of upregulated genes: 261, number of downregulated genes: 344"
## [1] "Intersection UP:"
## [1] "ERG1" "YOR1" "YOR1" "YOR1" "YOR1" "YNR014W" "YER158C"
## [8] "HEM13" "ADP1"
## [1] "Intersection DOWN:"
## [1] "PMA1" "PMA1" "YCL048W-A" "DSE3" "YJR115W" "TIR2"
## [7] "FUI1" "FUI1" "INA1" "INA1" "CWP1" "CWP1"
## [13] "CWP2" "CWP2" "SRP40" "PTR2" "TPO1" "YDL211C"
## [19] "SNG1" "MRH1" "RSB1" "RSB1" "GTT3" "GTT3"
## [25] "TOS1" "TOS1"
## [1] "Number of UPC2 targets in S. cerevisiae: 16, , number of upregulated genes: 261, number of downregulated genes: 344"
## [1] "Intersection UP:"
## [1] "PRR2" "ERG25" "ERG1" "ERG11" "ERG11" "NCP1" "ERG7" "ERG3" "ERG3"
## [10] "ERG5" "ERG2"
## [1] "Intersection DOWN:"
## [1] "FHN1"
Conclusions:
Report number of differentially expressed genes in CBS138 vs BG2:
## [1] "DMSO upregulated: 194, downregulated: 265"
## [1] "FCZ upregulated: 184, downregulated: 242"
## [1] "VCZ upregulated: 187, downregulated: 256"
DMSO upregulated in CBS138 (selected):
DMSO downregulated in CBS138 (selected):
FCZ upregulated in CBS138:
FCZ downregulated in CBS138 (selected):
VCZ upregulated in CBS138:
VCZ downregulated in CBS138:
The above conclusions for pairwise comparisons suggest that we should compare:
We’re looking for differential strain responses to drug. Here, “up” means that azoles induce more change in transcript abundance in CBS138 as compared to BG2. Again, treating both drugs as biological replicates, due to the small differences in response to the two diferent drugs.
dds_drugstrain <-
DESeqDataSetFromMatrix(
countData = counts_all_joined %>%
dplyr::select(sample_sheet_all_drugnodrug$Code) %>%
magrittr::set_rownames(counts_all_joined$Gene),
colData = sample_sheet_all_drugnodrug,
design = ~ Strain * Drug) %>%
DESeq()
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds_drugstrain)
## [1] "Intercept" "Strain_CBS138_vs_BG2" "DrugTRUE"
## [4] "StrainCBS138.DrugTRUE"
deseq_df_drugstrain <-
deseq_df_transform(dds_drugstrain,
termstokeep = "StrainCBS138.DrugTRUE")
deseq_df_drugstrain
## # A tibble: 5,124 × 5
## gene baseMean log2FC stderror padj
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 CAGL0H10626g 220. -0.916 1.18 0.986
## 2 CAGL0A00165g 939. 0.220 0.192 0.986
## 3 CAGL0A00187g 2413. 0.0531 0.279 0.991
## 4 CAGL0A00209g 6755. 0.0379 0.237 0.991
## 5 CAGL0A00231g 515. 0.865 0.399 0.546
## 6 CAGL0A00253g 2282. 0.0776 0.273 0.991
## 7 CAGL0A00275g 2973. 0.0260 0.288 0.993
## 8 CAGL0A00297g 1698. 0.321 0.222 0.899
## 9 CAGL0A00319g 1804. 0.322 0.152 0.583
## 10 CAGL0A00341g 654. 0.272 0.417 0.986
## # ℹ 5,114 more rows
DEGdf_up2x_FDR5_drugstrain = get_upregulated_genes(deseq_df_drugstrain)
DEGdf_down2x_FDR5_drugstrain = get_downregulated_genes(deseq_df_drugstrain)
print(DEGdf_up2x_FDR5_drugstrain, n = 20)
## # A tibble: 9 × 5
## gene baseMean log2FC stderror padj
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 CAGL0J02530g 466. 2.07 0.376 0.0000599
## 2 CAGL0L06776g 1896. 1.37 0.274 0.000471
## 3 CAGL0I09856g 1624. 1.35 0.368 0.0246
## 4 CAGL0K04719g 2367. 1.29 0.348 0.0238
## 5 CAGL0M06347g 2086. 1.24 0.235 0.000134
## 6 CAGL0L08426g 1163. 1.23 0.331 0.0233
## 7 CAGL0K10824g 1318. 1.18 0.289 0.00899
## 8 CAGL0M08800g 1528. 1.18 0.327 0.0290
## 9 CAGL0M03839g 21921. 1.12 0.313 0.0325
print(DEGdf_down2x_FDR5_drugstrain, n = 20)
## # A tibble: 22 × 5
## gene baseMean log2FC stderror padj
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 CAGL0F06875g 11158. -2.81 0.398 0.00000000921
## 2 CAGL0J06402g 2664. -2.34 0.472 0.000508
## 3 CAGL0M01892g 39.9 -1.90 0.510 0.0233
## 4 CAGL0G02585g 8260. -1.79 0.385 0.00170
## 5 CAGL0F02431g 5599. -1.77 0.394 0.00288
## 6 CAGL0F04917g 114. -1.73 0.441 0.0151
## 7 CAGL0I09592g 938. -1.72 0.450 0.0203
## 8 CAGL0K07788g 14241. -1.67 0.431 0.0167
## 9 CAGL0C03872g 10721. -1.67 0.399 0.00763
## 10 CAGL0I08987g 949. -1.47 0.402 0.0253
## 11 CAGL0L02079g 7782. -1.47 0.322 0.00224
## 12 CAGL0K11616g 5367. -1.44 0.340 0.00697
## 13 CAGL0J09240g 9171. -1.44 0.354 0.0100
## 14 CAGL0E06688g 683. -1.37 0.341 0.0107
## 15 CAGL0C03443g 15756. -1.34 0.297 0.00275
## 16 CAGL0B01507g 734. -1.27 0.233 0.0000599
## 17 CAGL0G04807g 1090. -1.21 0.321 0.0222
## 18 CAGL0L01551g 1556. -1.18 0.287 0.00899
## 19 CAGL0I07711g 387. -1.16 0.285 0.0100
## 20 CAGL0G08668g 2336. -1.14 0.263 0.00538
## # ℹ 2 more rows
p_drugstrain = volcano_plot(deseq_df_drugstrain, DEGdf_up2x_FDR5_drugstrain, DEGdf_down2x_FDR5_drugstrain,
ylimits = c(0,5),
xlabel = "log2 fold-change\n← more change in BG2 more change in CBS138 →") +
ggtitle("Azole vs DMSO, strain differences")
p_drugstrain
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
write.csv(deseq_df_drugstrain, "../data/deseq2/DrugStrainInteractions/DrugStrain_CBS138_vs_BG2_allDESeq.csv")
write.csv(DEGdf_up2x_FDR5_drugstrain, "../data/deseq2/DrugStrainInteractions/DrugStrain_CBS138_vs_BG2_up2x_FDR5.csv")
writeLines(DEGdf_up2x_FDR5_drugstrain$gene, "../data/deseq2/DrugStrainInteractions/DrugStrain_CBS138_vs_BG2_up2x_FDR5_names.txt")
write.csv(DEGdf_down2x_FDR5_drugstrain, "../data/deseq2/DrugStrainInteractions/DrugStrain_CBS138_vs_BG2_down2x_FDR5.csv")
writeLines(DEGdf_down2x_FDR5_drugstrain$gene, "../data/deseq2/DrugStrainInteractions/DrugStrain_CBS138_vs_BG2_down2x_FDR5_names.txt")
Highlighting calcium transporters MID1, ECM7, CCH1.
# Warning! This code chunk is based on manually reading in a file with gene names only of calculated deseq genes. It would be better to do a merge on all Cg gene names.
calcium_transporters <-
tribble(~gene, ~name,
"CAGL0B02211g", "CCH1",
"CAGL0M03597g", "MID1",
"CAGL0M00748g", "ECM7")
favourite_TFs <-
tribble(~gene, ~name,
"CAGL0M08800g", "YAP6",
"CAGL0L06776g", "GAT3/4")
drugstrain_down_names <-
read_tsv("../data/deseq2/DrugStrainInteractions/DrugStrain_CBS138_vs_BG2_down2x_FDR5_fungidbtable.txt") %>%
select(gene, name) %>%
filter(!is.na(name))
## Rows: 22 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): gene, description, name
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
p_summary_volcanos_labels =
plot_grid(
p_cbd +
geom_label_repel(data = deseq_df_cbd %>%
right_join(calcium_transporters, by = "gene"),
aes(label = name),
min.segment.length = 0, max.overlaps = 20,
box.padding = 0.1, label.padding = 0.1),
p_drugnodrug_bc +
geom_label_repel(data = deseq_df_drugnodrug_bc %>%
right_join(calcium_transporters, by = "gene"),
aes(label = name),
min.segment.length = 0, max.overlaps = 20,
box.padding = 0.1, label.padding = 0.1),
p_drugstrain +
geom_label_repel(data = deseq_df_drugstrain %>%
right_join(
bind_rows(calcium_transporters,
favourite_TFs,
drugstrain_down_names),
by = "gene"),
aes(label = name),
min.segment.length = 0, max.overlaps = 20,
box.padding = 0.1, label.padding = 0.1),
ncol = 1, nrow = 3)
## Warning: Removed 16 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
p_summary_volcanos_labels
ggsave("../figures/summary_volcano_plots_genelabels.png",
plot = p_summary_volcanos_labels,
width = 7.5, height = 9)
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
# Generate 3 sets of 200 words
set1 <- paste(rep("word_" , 200) , sample(c(1:1000) , 200 , replace=F) , sep="")
set2 <- paste(rep("word_" , 200) , sample(c(1:1000) , 200 , replace=F) , sep="")
set3 <- paste(rep("word_" , 200) , sample(c(1:1000) , 200 , replace=F) , sep="")
set.seed(20190708)
genes <- paste("gene",1:1000,sep="")
x <- list(
A = sample(genes,300),
B = sample(genes,525),
C = sample(genes,440),
D = sample(genes,350)
)
# Chart
venn.diagram(
x = list(set1, set2, set3),
category.names = c("Set 1" , "Set 2 " , "Set 3"),
filename = NULL,
output=TRUE
)
## (polygon[GRID.polygon.8475], polygon[GRID.polygon.8476], polygon[GRID.polygon.8477], polygon[GRID.polygon.8478], polygon[GRID.polygon.8479], polygon[GRID.polygon.8480], text[GRID.text.8481], text[GRID.text.8482], text[GRID.text.8483], text[GRID.text.8484], text[GRID.text.8485], text[GRID.text.8486], text[GRID.text.8487], text[GRID.text.8488], text[GRID.text.8489], text[GRID.text.8490])
display_venn <- function(x, ...){
library(VennDiagram)
grid.newpage()
venn_object <- venn.diagram(x, filename = NULL, ...)
grid.draw(venn_object)
}
display_venn(
x,
category.names = c("Set 1" , "Set 2 " , "Set 3", "Set 4"),
fill = c("#999999", "#E69F00", "#56B4E9", "#009E73")
)
display_venn(x)
library("ggVennDiagram")
##
## Attaching package: 'ggVennDiagram'
## The following object is masked from 'package:tidyr':
##
## unite
ggVennDiagram(x, label_alpha = 0)
cf_up = as.vector(ortho_cb_c_up$gene)
cbd_up = as.vector(DEGdf_up2x_FDR5_cbd$gene)
cbf_up = as.vector(DEGdf_up2x_FDR5_cbf$gene)
bf_up = as.vector(ortho_cb_b_up$gene_id_cbs138)
y <- list(
CBS138_FLUC = cf_up,
strain_DMSO = cbd_up,
strain_FLUC = cbf_up,
BG2_FLUC = bf_up
)
display_venn(
y,
fill = c("#999999", "#E69F00", "#56B4E9", "#009E73")
)
print(head(DEGdf_up2x_FDR5_cbd))
## # A tibble: 6 × 5
## gene baseMean log2FC stderror padj
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 CAGL0K00110g 13966. 12.5 0.809 3.70e- 51
## 2 CAGL0C00110g 4327. 10.3 0.274 1.58e-303
## 3 CAGL0H10626g 203. 9.79 0.873 4.30e- 27
## 4 CAGL0E00187g 550. 8.81 0.816 3.86e- 25
## 5 CAGL0E06666g 2836. 8.68 0.823 6.27e- 24
## 6 CAGL0K13024g 1275. 8.65 0.334 9.32e-145
import pandas as pd
plt.rcParams['svg.fonttype'] = 'none'
rlog_all_p = r.rlog_all_df
rlog_all_p = rlog_all_p.reset_index().rename(columns={'index': 'Geneid'})
filamentous_genes = ["CAGL0H02145g", "CAGL0I01254g", "CAGL0I09856g", "CAGL0K04169g", "CAGL0M03663g", "CAGL0M09042g"]
cell_adhesion_genes = ["CAGL0K00110g", "CAGL0A01584g", "CAGL0I09856g", "CAGL0K12078g", "CAGL0I00220g", "CAGL0E06688g", "CAGL0C00110g"]
s = rlog_all_p[rlog_all_p["Geneid"].isin(cell_adhesion_genes)]
t = s.set_index("Geneid")
#plt.figure(figsize=(10, 10))
#plt.subplots_adjust(left=0.5, right=1.5, bottom=4.0)
g = sns.clustermap(t, cmap='vlag', method="single", annot_kws={"size": 7})
plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0) # For y axis
plt.show()
#plt.savefig("/Users/s1964600/Work/random_figures/new_svg/cpac_scer_deg.svg")
plt.close()
Main manuscript RNA-seq composite plot.
Load gene info from FungiDB with some of the important gene names.
CBS138_geneinfo <-
read_tsv("../data/genomes/CBS138_allgenes.txt",
col_names = c("Geneid", "Transcriptid","GenomicLocation", "Description", "Name"),
skip = 1,
na = c("", "NA", "N/A"))
## Rows: 5615 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): Geneid, Transcriptid, GenomicLocation, Description, Name
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
BG2_geneinfo <-
read_tsv("../data/genomes/BG2_allgenes.txt",
col_names = c("Geneid", "Transcriptid","GenomicLocation", "Description"),
skip = 1,
na = c("", "NA", "N/A")) %>%
mutate(Name = stringr::str_extract(Description, "^[A-Z]+[0-9]+"))
## Rows: 5481 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Geneid, Transcriptid, GenomicLocation, Description
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.